{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6df14e55",
   "metadata": {},
   "source": [
    "## Notebook 3 - Enzyme-free controls"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e85d0b35",
   "metadata": {},
   "source": [
    "This notebook removes removes any substrates found to react in the enzyme-free (lysate-containing) control experiments."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8aa2476a",
   "metadata": {},
   "outputs": [],
   "source": [
    "%run ../common.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3476159e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Determination of memory status is not supported on this \n",
      " platform, measuring for memoryleaks will never fail\n"
     ]
    }
   ],
   "source": [
    "from pyopenms import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "bb292eac",
   "metadata": {},
   "outputs": [],
   "source": [
    "# We already prepared the reference database\n",
    "\n",
    "df_database = pd.read_pickle(filepath_results + 'MS2_database_shifts_162_320_VB.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98fbc209",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_enzymefree(df_database, mixes):\n",
    "\n",
    "    df = pd.DataFrame(columns = ['Mix', 'Name', 'InchiKey', 'ik_MoNA', 'CSMILES', 'PrecursorMZ',\n",
    "                                 'RT', 'CosineScore','MS1_AUC'])\n",
    "\n",
    "    for mix_no in mixes:\n",
    "        \n",
    "        print(f'\\nProcessing mix {mix_no}')\n",
    "\n",
    "        ## Prepare reference\n",
    "        df_mix = df_database[df_database['Mix'] == mix_no]\n",
    "\n",
    "        ref_name = list(df_mix['Name'])\n",
    "        ref_smiles = list(df_mix['CSMILES'])\n",
    "        ref_ikmona = list(df_mix['ik_MoNA'])\n",
    "        ref_inchikey = list(df_mix['InchiKey'])\n",
    "        colnames = [a for a in df_mix.columns if 'M+' in a]\n",
    "        ref_precursors_mz = np.array(df_mix[colnames])\n",
    "        ref_fragments_mz = list(df_mix['mz'])\n",
    "        ref_fragments_i = list(df_mix['relint'])\n",
    "\n",
    "        filename = glob.glob(f\"../data/screening_data/enzymefree_controls_data/*_Mix_{mix_no}.mzML\")[0]\n",
    "\n",
    "        exp = MSExperiment()\n",
    "        MzMLFile().load(filename, exp)\n",
    "\n",
    "        ## SUBSET EXPERIMENTAL MS2 SPECTRA\n",
    "\n",
    "        list_hit_name = []\n",
    "        list_hit_mzprecursor = []\n",
    "        list_hit_rt = []\n",
    "        list_hit_score = []\n",
    "        list_hit_smiles = []\n",
    "        list_hit_ikmona = []\n",
    "        list_hit_inchikey = []\n",
    "\n",
    "        for spec in exp:\n",
    "            if spec.getMSLevel() == 2:\n",
    "\n",
    "                mz_precursor = spec.getPrecursors()[0].getMZ()\n",
    "\n",
    "                test = ((np.abs(mz_precursor - ref_precursors_mz)/mz_precursor)*1e6) < ppm # boolean 2D matrix\n",
    "                score_best = score_threshold\n",
    "                for i in np.where(test)[0]:\n",
    "                        query_mz, query_intensities = spec.get_peaks()\n",
    "                        score = cosine_greedy_score(query_mz, query_intensities, ref_fragments_mz[i], np.array(ref_fragments_i[i]))\n",
    "                        if score > score_best:\n",
    "                            score_best = score\n",
    "                            I = i\n",
    "\n",
    "                if score_best > score_threshold:\n",
    "                    list_hit_mzprecursor.append(mz_precursor)\n",
    "                    rt = spec.getRT() # seconds\n",
    "                    list_hit_rt.append(rt)\n",
    "                    list_hit_score.append(score_best)\n",
    "                    list_hit_name.append(ref_name[I])\n",
    "                    list_hit_smiles.append(ref_smiles[I])\n",
    "                    list_hit_ikmona.append(ref_ikmona[I])\n",
    "                    list_hit_inchikey.append(ref_inchikey[I])\n",
    "\n",
    "        df_temp = pd.DataFrame({\n",
    "            'File': filename,\n",
    "            'Mix': mix_no,\n",
    "            'Name': list_hit_name,\n",
    "            'CSMILES': list_hit_smiles,\n",
    "            'PrecursorMZ': np.round(list_hit_mzprecursor,4),\n",
    "            'RT': np.round(list_hit_rt, 2),\n",
    "            'CosineScore': np.round(list_hit_score,3),\n",
    "            'InchiKey': list_hit_inchikey,\n",
    "            'ik_MoNA': list_hit_ikmona,\n",
    "        })\n",
    "\n",
    "        # Let us now remove duplicate products based on their name. We keep the one with the highest score\n",
    "        df_temp.sort_values(by=['CosineScore'], ascending=[False], inplace=True) # The sorting ensures we keep the highest score when dropping\n",
    "        df_temp = df_temp.drop_duplicates(subset=['InchiKey', 'Mix'], keep='first') # We will keep the highest score for each Name\n",
    "\n",
    "        print(f'{len(df_temp)} products identified')\n",
    "\n",
    "        ## INTEGRATE IN THE MS1 CHANNEL (2D integration)\n",
    "\n",
    "        list_hit_AUC = []\n",
    "\n",
    "        t = 0.\n",
    "        for j, row in df_temp.iterrows(): \n",
    "\n",
    "            print(f\"Product {row['Name']} identified with score {row['CosineScore']}\")\n",
    "\n",
    "            rt_hit = row['RT']\n",
    "            mz_hit = row['PrecursorMZ']\n",
    "\n",
    "            AUC = []\n",
    "            t_integrate = []\n",
    "            mz_hit = list_hit_mzprecursor[j]\n",
    "            mz_ub = mz_hit*(1+ppm_integrate/1e6)\n",
    "            mz_lb = mz_hit*(1-ppm_integrate/1e6)\n",
    "            rt_lb = rt_hit - rt_semiwindow\n",
    "            rt_ub = rt_hit + rt_semiwindow\n",
    "            for spec in exp:\n",
    "                if spec.getMSLevel() == 1:\n",
    "                    rt = spec.getRT()\n",
    "                    if rt < rt_lb:\n",
    "                        continue\n",
    "                    if rt > rt_ub:\n",
    "                        break\n",
    "\n",
    "\n",
    "                    # Let us integrate along the m/z dimension to obtain the AUC for that time slice\n",
    "                    mz_peaks, i_peaks = spec.get_peaks()\n",
    "                    idx_bool = (mz_peaks > mz_lb) & (mz_peaks < mz_ub)\n",
    "                    if any(idx_bool):\n",
    "                        i_peaks_integrate = np.concatenate(([0.], i_peaks[idx_bool], [0.]))\n",
    "                        mz_peaks_integrate = np.concatenate(([mz_lb], mz_peaks[idx_bool], [mz_ub]))\n",
    "                        AUC_i = np.trapz(y=i_peaks_integrate, x=mz_peaks_integrate)\n",
    "\n",
    "                        # Let us compute the baseline AUC to be substracted\n",
    "                        #idx_where = np.where(idx_bool)[0]\n",
    "\n",
    "                        #idx_first = idx_where[0]\n",
    "                        #idx_last = idx_where[-1]\n",
    "                        #i_peaks_integrate = np.array([0., i_peaks[idx_first], i_peaks[idx_last], 0.])\n",
    "                        #mz_peaks_integrate = np.array([mz_lb, mz_peaks[idx_first], mz_peaks[idx_last], mz_ub])\n",
    "                        #AUC_base = np.trapz(y=i_peaks_integrate, x=mz_peaks_integrate)\n",
    "                        AUC_base = 0.\n",
    "\n",
    "                        t_integrate.append(rt)\n",
    "                        AUC.append(AUC_i-AUC_base)\n",
    "\n",
    "            # Let us integrate along the time dimension\n",
    "            t_integrate = np.concatenate(([rt_lb], t_integrate, [rt_ub]))\n",
    "            AUC = np.concatenate(([0.], AUC, [0.]))\n",
    "            AUC = np.trapz(y=AUC, x=t_integrate)\n",
    "            list_hit_AUC.append(AUC)\n",
    "\n",
    "        df_temp['AUC'] = np.round(list_hit_AUC,0)\n",
    "        df = pd.concat([df,df_temp], ignore_index=True)\n",
    "        mixes = set(df['Mix'])\n",
    "    return df, mixes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "190396aa",
   "metadata": {},
   "source": [
    "We will run `process_enzymefree` iteratively until no more false positives arise in these controls.\n",
    "\n",
    "We only need to repeat those mixes for which some product was identified."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "edf31dbe",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Processing mix 1\n",
      "0 products identified\n",
      "\n",
      "Processing mix 2\n",
      "0 products identified\n",
      "\n",
      "Processing mix 3\n",
      "0 products identified\n",
      "\n",
      "Processing mix 4\n",
      "1 products identified\n",
      "Product 2,3-DIHYDROXY-4-METHOXY-4'-ETHOXYBENZOPHENONE identified with score 0.857\n",
      "\n",
      "Processing mix 5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/v1/xrsbvyx519g764hlvcqf01p40000gq/T/ipykernel_1307/380166049.py:135: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
      "  df = pd.concat([df,df_temp], ignore_index=True)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 products identified\n",
      "Product PHLORETIN identified with score 0.978\n",
      "\n",
      "Processing mix 6\n",
      "3 products identified\n",
      "Product S-ISOCORYDINE (+) identified with score 0.975\n",
      "Product NARINGENIN identified with score 0.971\n",
      "Product HESPERETIN identified with score 0.971\n",
      "\n",
      "Processing mix 7\n",
      "2 products identified\n",
      "Product TRYPTAMINE identified with score 0.946\n",
      "Product HARPAGOSIDE identified with score 0.894\n",
      "\n",
      "Processing mix 8\n",
      "1 products identified\n",
      "Product 7,8-DIHYDROXYFLAVONE identified with score 0.94\n",
      "\n",
      "Processing mix 9\n",
      "0 products identified\n",
      "\n",
      "Processing mix 10\n",
      "0 products identified\n",
      "\n",
      "Processing mix 11\n",
      "0 products identified\n",
      "\n",
      "Processing mix 12\n",
      "1 products identified\n",
      "Product BIOCHANIN A identified with score 0.928\n",
      "\n",
      "Processing mix 4\n",
      "0 products identified\n",
      "\n",
      "Processing mix 5\n",
      "0 products identified\n",
      "\n",
      "Processing mix 6\n",
      "0 products identified\n",
      "\n",
      "Processing mix 7\n",
      "0 products identified\n",
      "\n",
      "Processing mix 8\n",
      "0 products identified\n",
      "\n",
      "Processing mix 12\n",
      "0 products identified\n"
     ]
    }
   ],
   "source": [
    "df = pd.DataFrame()\n",
    "names_drop = set()\n",
    "mixes = range(1,13)\n",
    "\n",
    "for k in range(10):\n",
    "    df_tmp, mixes = process_enzymefree(df_database, mixes)\n",
    "    \n",
    "    if len(df_tmp) == 0:\n",
    "        break\n",
    "    \n",
    "    tmp = set(df_tmp['Name'])\n",
    "    names_drop = names_drop.union(tmp)\n",
    "    \n",
    "    df_database = df_database[~df_database['Name'].isin(names_drop)]\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "379ccda2",
   "metadata": {},
   "source": [
    "Set of hits identified in the enzyme-free controls. These would have been false positives, so we will just discard the corresponding substrates:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "311c345b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{\"2,3-DIHYDROXY-4-METHOXY-4'-ETHOXYBENZOPHENONE\",\n",
       " '7,8-DIHYDROXYFLAVONE',\n",
       " 'BIOCHANIN A',\n",
       " 'HARPAGOSIDE',\n",
       " 'HESPERETIN',\n",
       " 'NARINGENIN',\n",
       " 'PHLORETIN',\n",
       " 'S-ISOCORYDINE (+)',\n",
       " 'TRYPTAMINE'}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "names_drop"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8f7e4e13",
   "metadata": {},
   "source": [
    "Let us now drop the corresponding substrates from `df_substrates` and `df_database`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "e172c8b9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(7211, 20)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_database.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "434cbde9",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_database_clean = df_database[~df_database['Name'].isin(names_drop)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "eabc19d8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(7211, 20)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_database_clean.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "a23e3686",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_database_clean.to_pickle(filepath_results + 'MS2_database_shifts_162_320_VB_clean.pkl')\n",
    "df_database_clean.to_csv(filepath_results + 'MS2_database_shifts_162_320_VB_clean.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "0c231c60",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_substrates = pd.read_csv('./tmp/Substrates_VB.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "ab423807",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(464, 17)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_substrates.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bca42055",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "957a663d",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_substrates_clean = df_substrates[~df_substrates['Name'].isin(names_drop)].copy()\n",
    "\n",
    "df_substrates_clean = df_substrates[df_substrates['Name'].isin(df_database_clean['Name'])].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "059a95b8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(444, 17)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_substrates_clean.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "e7b154e8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "434"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df_substrates_clean['Name'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "fd4937a1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## QC\n",
    "all(df_substrates_clean.groupby('Name')['ik_MoNA'].nunique()==1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "dbe1c387",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "427"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_substrates_clean.ik_MoNA.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "712ee33c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Mix</th>\n",
       "      <th>Name</th>\n",
       "      <th>InchiKey</th>\n",
       "      <th>ik_MoNA</th>\n",
       "      <th>SMILES</th>\n",
       "      <th>CSMILES</th>\n",
       "      <th>M_charge</th>\n",
       "      <th>M+Glu+H</th>\n",
       "      <th>M+Glu+Na</th>\n",
       "      <th>M+Glu+2H</th>\n",
       "      <th>M+Glu+NH4</th>\n",
       "      <th>M+ACN+H</th>\n",
       "      <th>M+2Glu+H</th>\n",
       "      <th>M+2Glu+Na</th>\n",
       "      <th>M+2Glu+2H</th>\n",
       "      <th>M+2Glu+NH4</th>\n",
       "      <th>M+2Glu+ACN+H</th>\n",
       "      <th>superclass</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>3ALPHA-HYDROXY-3-DEOXYANGOLENSIC ACID METHYL E...</td>\n",
       "      <td>QFRUZVNPYYYLAN-BBJYNIMOSA-N</td>\n",
       "      <td>QFRUZVNPYYYLAN</td>\n",
       "      <td>C[C@@]12CCC3C(=C)[C@]1(CC(=O)O[C@H]2C4=COC=C4)...</td>\n",
       "      <td>C=C1C2CCC3(C)C(c4ccoc4)OC(=O)CC13OC1CC(O)C(C)(...</td>\n",
       "      <td>0</td>\n",
       "      <td>635.306679</td>\n",
       "      <td>657.288621</td>\n",
       "      <td>318.156978</td>\n",
       "      <td>652.333226</td>\n",
       "      <td>676.333226</td>\n",
       "      <td>797.359979</td>\n",
       "      <td>819.341921</td>\n",
       "      <td>399.183628</td>\n",
       "      <td>814.386526</td>\n",
       "      <td>838.386526</td>\n",
       "      <td>Triterpenoids</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>3-AMINO-BETA-PINENE</td>\n",
       "      <td>SQSDBXYJKLVZJR-UHFFFAOYSA-N</td>\n",
       "      <td>SQSDBXYJKLVZJR</td>\n",
       "      <td>CC1(C2CC1C(=C)C(C2)N)C.Cl</td>\n",
       "      <td>C=C1C(N)CC2CC1C2(C)C</td>\n",
       "      <td>0</td>\n",
       "      <td>314.196676</td>\n",
       "      <td>336.178618</td>\n",
       "      <td>157.601976</td>\n",
       "      <td>331.223223</td>\n",
       "      <td>355.223223</td>\n",
       "      <td>476.249976</td>\n",
       "      <td>498.231918</td>\n",
       "      <td>238.628626</td>\n",
       "      <td>493.276523</td>\n",
       "      <td>517.276523</td>\n",
       "      <td>Monoterpenoids</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3BETA-HYDROXY-23,24-BISNORCHOL-5-ENIC ACID</td>\n",
       "      <td>NPBNRBWMDNZEBN-YTEKVJICSA-N</td>\n",
       "      <td>NPBNRBWMDNZEBN</td>\n",
       "      <td>CC([C@H]1CCC2[C@@]1(CCC3C2CC=C4[C@@]3(CC[C@@H]...</td>\n",
       "      <td>CC(C(=O)O)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C</td>\n",
       "      <td>0</td>\n",
       "      <td>509.311371</td>\n",
       "      <td>531.293313</td>\n",
       "      <td>255.159323</td>\n",
       "      <td>526.337918</td>\n",
       "      <td>550.337918</td>\n",
       "      <td>671.364671</td>\n",
       "      <td>693.346613</td>\n",
       "      <td>336.185973</td>\n",
       "      <td>688.391218</td>\n",
       "      <td>712.391218</td>\n",
       "      <td>Steroids</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>3-HYDROXYTYRAMINE</td>\n",
       "      <td>VYFYYTLLBUKUHU-UHFFFAOYSA-N</td>\n",
       "      <td>VYFYYTLLBUKUHU</td>\n",
       "      <td>C1=CC(=C(C=C1CCN)O)O</td>\n",
       "      <td>NCCc1ccc(O)c(O)c1</td>\n",
       "      <td>0</td>\n",
       "      <td>316.139555</td>\n",
       "      <td>338.121497</td>\n",
       "      <td>158.573415</td>\n",
       "      <td>333.166102</td>\n",
       "      <td>357.166102</td>\n",
       "      <td>478.192855</td>\n",
       "      <td>500.174797</td>\n",
       "      <td>239.600065</td>\n",
       "      <td>495.219402</td>\n",
       "      <td>519.219402</td>\n",
       "      <td>Tyrosine alkaloids</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>AVOCADYNE ACETATE</td>\n",
       "      <td>JAKAZHIACKJNNB-UHFFFAOYSA-N</td>\n",
       "      <td>JAKAZHIACKJNNB</td>\n",
       "      <td>CC(=O)OCC(CC(CCCCCCCCCCCC#C)O)O</td>\n",
       "      <td>C#CCCCCCCCCCCCC(O)CC(O)COC(C)=O</td>\n",
       "      <td>0</td>\n",
       "      <td>489.306286</td>\n",
       "      <td>511.288228</td>\n",
       "      <td>245.156781</td>\n",
       "      <td>506.332833</td>\n",
       "      <td>530.332833</td>\n",
       "      <td>651.359586</td>\n",
       "      <td>673.341528</td>\n",
       "      <td>326.183431</td>\n",
       "      <td>668.386133</td>\n",
       "      <td>692.386133</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>459</th>\n",
       "      <td>12</td>\n",
       "      <td>SPERMIDINE</td>\n",
       "      <td>ATHGHQPFGPMSJY-UHFFFAOYSA-N</td>\n",
       "      <td>ATHGHQPFGPMSJY</td>\n",
       "      <td>C(CCNCCCN)CN</td>\n",
       "      <td>NCCCCNCCCN</td>\n",
       "      <td>0</td>\n",
       "      <td>308.218474</td>\n",
       "      <td>330.200416</td>\n",
       "      <td>154.612875</td>\n",
       "      <td>325.245021</td>\n",
       "      <td>349.245021</td>\n",
       "      <td>470.271774</td>\n",
       "      <td>492.253716</td>\n",
       "      <td>235.639525</td>\n",
       "      <td>487.298321</td>\n",
       "      <td>511.298321</td>\n",
       "      <td>Ornithine alkaloids</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>460</th>\n",
       "      <td>12</td>\n",
       "      <td>STEVIOL</td>\n",
       "      <td>QFVOYBUQQBFCRH-VQSWZGCSSA-N</td>\n",
       "      <td>QFVOYBUQQBFCRH</td>\n",
       "      <td>C[C@@]12CCC[C@@]([C@H]1CC[C@]34[C@H]2CC[C@](C3...</td>\n",
       "      <td>C=C1CC23CCC4C(C)(C(=O)O)CCCC4(C)C2CCC1(O)C3</td>\n",
       "      <td>0</td>\n",
       "      <td>481.280071</td>\n",
       "      <td>503.262013</td>\n",
       "      <td>241.143673</td>\n",
       "      <td>498.306618</td>\n",
       "      <td>522.306618</td>\n",
       "      <td>643.333371</td>\n",
       "      <td>665.315313</td>\n",
       "      <td>322.170323</td>\n",
       "      <td>660.359918</td>\n",
       "      <td>684.359918</td>\n",
       "      <td>Diterpenoids</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>461</th>\n",
       "      <td>12</td>\n",
       "      <td>STIGMASTEROL</td>\n",
       "      <td>HCXVJBMSMIARIN-PHZDYDNGSA-N</td>\n",
       "      <td>HCXVJBMSMIARIN</td>\n",
       "      <td>CC[C@H](/C=C/[C@@H](C)[C@H]1CC[C@@H]2[C@@]1(CC...</td>\n",
       "      <td>CCC(C=CC(C)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C)C...</td>\n",
       "      <td>0</td>\n",
       "      <td>575.431092</td>\n",
       "      <td>597.413034</td>\n",
       "      <td>288.219184</td>\n",
       "      <td>592.457639</td>\n",
       "      <td>616.457639</td>\n",
       "      <td>737.484392</td>\n",
       "      <td>759.466334</td>\n",
       "      <td>369.245834</td>\n",
       "      <td>754.510939</td>\n",
       "      <td>778.510939</td>\n",
       "      <td>Steroids</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>462</th>\n",
       "      <td>12</td>\n",
       "      <td>TODDALOLACTONE</td>\n",
       "      <td>GLWPLQBQHWYKRK-UHFFFAOYSA-N</td>\n",
       "      <td>GLWPLQBQHWYKRK</td>\n",
       "      <td>CC(C)(C(CC1=C(C=C2C(=C1OC)C=CC(=O)O2)OC)O)O</td>\n",
       "      <td>COc1cc2oc(=O)ccc2c(OC)c1CC(O)C(C)(C)O</td>\n",
       "      <td>0</td>\n",
       "      <td>471.186564</td>\n",
       "      <td>493.168506</td>\n",
       "      <td>236.096920</td>\n",
       "      <td>488.213111</td>\n",
       "      <td>512.213111</td>\n",
       "      <td>633.239864</td>\n",
       "      <td>655.221806</td>\n",
       "      <td>317.123570</td>\n",
       "      <td>650.266411</td>\n",
       "      <td>674.266411</td>\n",
       "      <td>Coumarins</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>463</th>\n",
       "      <td>12</td>\n",
       "      <td>TOMATINE</td>\n",
       "      <td>REJLGAUYTKNVJM-UHFFFAOYSA-N</td>\n",
       "      <td>REJLGAUYTKNVJM</td>\n",
       "      <td>CC1CCC2(C(C3C(O2)CC4C3(CCC5C4CCC6C5(CCC(C6)OC7...</td>\n",
       "      <td>CC1CCC2(NC1)OC1CC3C4CCC5CC(OC6OC(CO)C(OC7OC(CO...</td>\n",
       "      <td>0</td>\n",
       "      <td>1196.606335</td>\n",
       "      <td>1218.588277</td>\n",
       "      <td>598.806805</td>\n",
       "      <td>1213.632882</td>\n",
       "      <td>1237.632882</td>\n",
       "      <td>1358.659635</td>\n",
       "      <td>1380.641577</td>\n",
       "      <td>679.833455</td>\n",
       "      <td>1375.686182</td>\n",
       "      <td>1399.686182</td>\n",
       "      <td>Pseudoalkaloids</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>444 rows × 18 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Mix                                               Name  \\\n",
       "0      1  3ALPHA-HYDROXY-3-DEOXYANGOLENSIC ACID METHYL E...   \n",
       "1      1                                3-AMINO-BETA-PINENE   \n",
       "2      1         3BETA-HYDROXY-23,24-BISNORCHOL-5-ENIC ACID   \n",
       "3      1                                  3-HYDROXYTYRAMINE   \n",
       "4      1                                  AVOCADYNE ACETATE   \n",
       "..   ...                                                ...   \n",
       "459   12                                         SPERMIDINE   \n",
       "460   12                                            STEVIOL   \n",
       "461   12                                       STIGMASTEROL   \n",
       "462   12                                     TODDALOLACTONE   \n",
       "463   12                                           TOMATINE   \n",
       "\n",
       "                        InchiKey         ik_MoNA  \\\n",
       "0    QFRUZVNPYYYLAN-BBJYNIMOSA-N  QFRUZVNPYYYLAN   \n",
       "1    SQSDBXYJKLVZJR-UHFFFAOYSA-N  SQSDBXYJKLVZJR   \n",
       "2    NPBNRBWMDNZEBN-YTEKVJICSA-N  NPBNRBWMDNZEBN   \n",
       "3    VYFYYTLLBUKUHU-UHFFFAOYSA-N  VYFYYTLLBUKUHU   \n",
       "4    JAKAZHIACKJNNB-UHFFFAOYSA-N  JAKAZHIACKJNNB   \n",
       "..                           ...             ...   \n",
       "459  ATHGHQPFGPMSJY-UHFFFAOYSA-N  ATHGHQPFGPMSJY   \n",
       "460  QFVOYBUQQBFCRH-VQSWZGCSSA-N  QFVOYBUQQBFCRH   \n",
       "461  HCXVJBMSMIARIN-PHZDYDNGSA-N  HCXVJBMSMIARIN   \n",
       "462  GLWPLQBQHWYKRK-UHFFFAOYSA-N  GLWPLQBQHWYKRK   \n",
       "463  REJLGAUYTKNVJM-UHFFFAOYSA-N  REJLGAUYTKNVJM   \n",
       "\n",
       "                                                SMILES  \\\n",
       "0    C[C@@]12CCC3C(=C)[C@]1(CC(=O)O[C@H]2C4=COC=C4)...   \n",
       "1                            CC1(C2CC1C(=C)C(C2)N)C.Cl   \n",
       "2    CC([C@H]1CCC2[C@@]1(CCC3C2CC=C4[C@@]3(CC[C@@H]...   \n",
       "3                                 C1=CC(=C(C=C1CCN)O)O   \n",
       "4                      CC(=O)OCC(CC(CCCCCCCCCCCC#C)O)O   \n",
       "..                                                 ...   \n",
       "459                                       C(CCNCCCN)CN   \n",
       "460  C[C@@]12CCC[C@@]([C@H]1CC[C@]34[C@H]2CC[C@](C3...   \n",
       "461  CC[C@H](/C=C/[C@@H](C)[C@H]1CC[C@@H]2[C@@]1(CC...   \n",
       "462        CC(C)(C(CC1=C(C=C2C(=C1OC)C=CC(=O)O2)OC)O)O   \n",
       "463  CC1CCC2(C(C3C(O2)CC4C3(CCC5C4CCC6C5(CCC(C6)OC7...   \n",
       "\n",
       "                                               CSMILES  M_charge      M+Glu+H  \\\n",
       "0    C=C1C2CCC3(C)C(c4ccoc4)OC(=O)CC13OC1CC(O)C(C)(...         0   635.306679   \n",
       "1                                 C=C1C(N)CC2CC1C2(C)C         0   314.196676   \n",
       "2          CC(C(=O)O)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C         0   509.311371   \n",
       "3                                    NCCc1ccc(O)c(O)c1         0   316.139555   \n",
       "4                      C#CCCCCCCCCCCCC(O)CC(O)COC(C)=O         0   489.306286   \n",
       "..                                                 ...       ...          ...   \n",
       "459                                         NCCCCNCCCN         0   308.218474   \n",
       "460        C=C1CC23CCC4C(C)(C(=O)O)CCCC4(C)C2CCC1(O)C3         0   481.280071   \n",
       "461  CCC(C=CC(C)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C)C...         0   575.431092   \n",
       "462              COc1cc2oc(=O)ccc2c(OC)c1CC(O)C(C)(C)O         0   471.186564   \n",
       "463  CC1CCC2(NC1)OC1CC3C4CCC5CC(OC6OC(CO)C(OC7OC(CO...         0  1196.606335   \n",
       "\n",
       "        M+Glu+Na    M+Glu+2H    M+Glu+NH4      M+ACN+H     M+2Glu+H  \\\n",
       "0     657.288621  318.156978   652.333226   676.333226   797.359979   \n",
       "1     336.178618  157.601976   331.223223   355.223223   476.249976   \n",
       "2     531.293313  255.159323   526.337918   550.337918   671.364671   \n",
       "3     338.121497  158.573415   333.166102   357.166102   478.192855   \n",
       "4     511.288228  245.156781   506.332833   530.332833   651.359586   \n",
       "..           ...         ...          ...          ...          ...   \n",
       "459   330.200416  154.612875   325.245021   349.245021   470.271774   \n",
       "460   503.262013  241.143673   498.306618   522.306618   643.333371   \n",
       "461   597.413034  288.219184   592.457639   616.457639   737.484392   \n",
       "462   493.168506  236.096920   488.213111   512.213111   633.239864   \n",
       "463  1218.588277  598.806805  1213.632882  1237.632882  1358.659635   \n",
       "\n",
       "       M+2Glu+Na   M+2Glu+2H   M+2Glu+NH4  M+2Glu+ACN+H           superclass  \n",
       "0     819.341921  399.183628   814.386526    838.386526        Triterpenoids  \n",
       "1     498.231918  238.628626   493.276523    517.276523       Monoterpenoids  \n",
       "2     693.346613  336.185973   688.391218    712.391218             Steroids  \n",
       "3     500.174797  239.600065   495.219402    519.219402   Tyrosine alkaloids  \n",
       "4     673.341528  326.183431   668.386133    692.386133                       \n",
       "..           ...         ...          ...           ...                  ...  \n",
       "459   492.253716  235.639525   487.298321    511.298321  Ornithine alkaloids  \n",
       "460   665.315313  322.170323   660.359918    684.359918         Diterpenoids  \n",
       "461   759.466334  369.245834   754.510939    778.510939             Steroids  \n",
       "462   655.221806  317.123570   650.266411    674.266411            Coumarins  \n",
       "463  1380.641577  679.833455  1375.686182   1399.686182      Pseudoalkaloids  \n",
       "\n",
       "[444 rows x 18 columns]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_substrates_clean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "e1f4c913",
   "metadata": {},
   "outputs": [],
   "source": [
    "assert set(df_database_clean['Name']) == set(df_substrates_clean['Name'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "5b4a0702",
   "metadata": {},
   "outputs": [],
   "source": [
    "assert set(df_database_clean['CSMILES']) == set(df_substrates_clean['CSMILES'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "dd7df096",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_substrates_clean.to_csv(filepath_results + 'Substrates_VB_clean.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38699f1e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "project_gt",
   "language": "python",
   "name": "project_gt"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
